import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
import itertools
import scipy.stats as stats
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# Load the FBI estimated-crimes dataset
df_crime = pd.read_csv('1.estimated_crimes_1979_2022.csv')
# Keep only the years 2018-2022 (isin replaces the original chain of OR comparisons)
df_crime = df_crime[df_crime['year'].isin(range(2018, 2023))]
# Total offenses per row: sum of the offense-count columns (positions 4-13 of the CSV)
df_crime['total_offenses'] = df_crime.iloc[:, 4:14].sum(axis=1)
# Keep only the columns needed downstream
df_crime = df_crime[['year', 'state_abbr', 'state_name', 'population', 'total_offenses']]
# Aggregate across the five years per state; select the numeric columns BEFORE
# summing so the groupby does not waste work aggregating unrelated columns
df_crime_group = df_crime.groupby(['state_abbr', 'state_name'])[['population', 'total_offenses']].sum()
# Flatten the group keys back into regular columns
df_crime = df_crime_group.reset_index()
# Offenses per person over the whole 2018-2022 window
df_crime['offense_rate_per_person'] = df_crime['total_offenses'] / df_crime['population']
# Index by state name so the later merges align on it
df_crime.set_index('state_name', inplace=True)
# Print the number of states in the dataset
print(len(df_crime))
# Display the first 3 rows of the dataframe
df_crime.head(3)
51
| state_abbr | population | total_offenses | offense_rate_per_person | |
|---|---|---|---|---|
| state_name | ||||
| Alaska | AK | 3667665 | 237826.0 | 0.064844 |
| Alabama | AL | 24841320 | 1292214.0 | 0.052019 |
| Arkansas | AR | 15134999 | 1002778.0 | 0.066256 |
# Read the employment-status / income table (ACS-style export)
df_emp_income = pd.read_csv('2.Emplyment_status_and_income_by_state.csv')
df_emp_income.set_index('Label (Grouping)', inplace=True)
# After transposing, every 4th row belongs to a new state; the four
# columns of interest are picked by position
df_emp_income = df_emp_income.T.iloc[0::4, [1, 2, 5, 68]]
# The transposed index looks like "StateName!!..." - keep the state name only
df_emp_income.index = df_emp_income.index.str.split('!!').str[0]
# Positional rename of the four selected columns
df_emp_income.columns = ['total_population_over_16', 'total_labor_force',
                         'unemployment', 'mean_household_income']
# Strip thousands separators and convert everything to integers in one pass
df_emp_income = df_emp_income.replace({',': ''}, regex=True).astype(int)
# Unemployment rate = unemployed / total labor force
df_emp_income['unemployment_rate'] = (df_emp_income['unemployment']
                                      / df_emp_income['total_labor_force'])
# Alphabetical order by state name
df_emp_income = df_emp_income.sort_index()
# Print the number of rows (states) in the dataframe
print(len(df_emp_income))
# Display the first 3 rows of the dataframe
df_emp_income.head(3)
52
| total_population_over_16 | total_labor_force | unemployment | mean_household_income | unemployment_rate | |
|---|---|---|---|---|---|
| Alabama | 4046614 | 2345086 | 120030 | 82992 | 0.051184 |
| Alaska | 573998 | 383078 | 23035 | 110602 | 0.060131 |
| Arizona | 5764417 | 3490030 | 186058 | 98569 | 0.053311 |
# Load the educational-attainment table (ACS-style export)
df_high_edu = pd.read_csv('3.Educational_attainment_by_state_and_income.csv')
df_high_edu.set_index('Label (Grouping)', inplace=True)
# After transposing, every 12th row belongs to a new state; columns 1-13 are
# attainment counts, 63-67 are the income columns
df_high_edu = df_high_edu.T.iloc[0::12, list(range(1, 14)) + list(range(63, 68))]
# Positional rename: 18-24 and 25+ attainment buckets plus income columns
df_high_edu.columns = ['population_18-24', '18-24_less_than_high_school', '18-24_high_school', '18-24_higher_than_high_school_1',
                       '18-24_higher_than_high_school_2', 'population_over_25', '25_over_less_than_high_school_1',
                       '25_over_less_than_high_school_2', '25_over_high_school', '25_over_higher_than_high_school_1',
                       '25_over_higher_than_high_school_2', '25_over_higher_than_high_school_3', '25_over_higher_than_high_school_4',
                       'income_less_than_high_school', 'income_high_school', 'income_higher_than_high_school_1',
                       'income_higher_than_high_school_2', 'income_higher_than_high_school_3']
# Strip thousands separators and convert everything to integers in one pass
df_high_edu = df_high_edu.replace({',': ''}, regex=True).astype(int)
# The transposed index looks like "StateName!!..." - keep the state name only
df_high_edu.index = df_high_edu.index.str.split('!!').str[0]
# Alphabetical order by state name
df_high_edu = df_high_edu.sort_index()
# Total adult population = 18-24 group plus the over-25 group
df_high_edu['population_total'] = df_high_edu['population_18-24'] + df_high_edu['population_over_25']
# Population with less than a high-school education (both age groups)
df_high_edu['population_less_than_high_school'] = df_high_edu['18-24_less_than_high_school'] + df_high_edu['25_over_less_than_high_school_1'] + df_high_edu['25_over_less_than_high_school_2']
# Population whose highest attainment is a high-school diploma
df_high_edu['population_high_school'] = df_high_edu['18-24_high_school'] + df_high_edu['25_over_high_school']
# Population with education beyond high school (all sub-buckets, both age groups)
df_high_edu['population_higher_than_high_school'] = df_high_edu['18-24_higher_than_high_school_1'] + df_high_edu['18-24_higher_than_high_school_2'] +\
df_high_edu['25_over_higher_than_high_school_1'] + df_high_edu['25_over_higher_than_high_school_2'] +\
df_high_edu['25_over_higher_than_high_school_3'] + df_high_edu['25_over_higher_than_high_school_4']
# Average income across the three above-high-school income columns
df_high_edu['income_higher_than_high_school'] = (df_high_edu['income_higher_than_high_school_1'] + df_high_edu['income_higher_than_high_school_2'] +\
df_high_edu['income_higher_than_high_school_3']) / 3
# Shares of each attainment level within the total adult population
df_high_edu['perc_less_than_high_school'] = df_high_edu['population_less_than_high_school'] / df_high_edu['population_total']
df_high_edu['perc_high_school'] = df_high_edu['population_high_school'] / df_high_edu['population_total']
df_high_edu['perc_higher_than_high_school'] = df_high_edu['population_higher_than_high_school'] / df_high_edu['population_total']
# Keep only the derived columns. NOTE: this selection already discards every
# raw bucket column, so the explicit drop() that used to precede it was
# redundant and has been removed.
df_high_edu = df_high_edu[['perc_less_than_high_school', 'perc_high_school', 'perc_higher_than_high_school',
                           'income_less_than_high_school', 'income_high_school', 'income_higher_than_high_school']]
# Normalize income values by dividing by 100,000
df_high_edu[['income_less_than_high_school', 'income_high_school', 'income_higher_than_high_school']] = df_high_edu[['income_less_than_high_school', 'income_high_school', 'income_higher_than_high_school']] / 100000
# Print the number of rows (states) in the dataframe
print(len(df_high_edu))
# Display the first 3 rows of the dataframe
df_high_edu.head(3)
52
| perc_less_than_high_school | perc_high_school | perc_higher_than_high_school | income_less_than_high_school | income_high_school | income_higher_than_high_school | |
|---|---|---|---|---|---|---|
| Alabama | 0.122396 | 0.310244 | 0.567360 | 0.27789 | 0.34470 | 0.560773 |
| Alaska | 0.072740 | 0.310236 | 0.617023 | 0.30804 | 0.41338 | 0.686773 |
| Arizona | 0.116928 | 0.250149 | 0.632923 | 0.30398 | 0.36792 | 0.613287 |
# Read the poverty-rate table (ACS-style export)
df_poverty = pd.read_csv('4.poverty_rate_by_state.csv')
df_poverty.set_index('Label (Grouping)', inplace=True)
# After transposing, rows 4, 10, 16, ... hold each state's poverty figure;
# only the first column is needed
df_poverty = df_poverty.T.iloc[4::6, 0:1]
# The transposed index looks like "StateName!!..." - keep the state name only
df_poverty.index = df_poverty.index.str.split('!!').str[0]
df_poverty.columns = ['poverty_rate']
# "15.7%" -> 0.157: drop the percent sign, parse as float, rescale to a fraction
df_poverty = df_poverty.replace({'%': ''}, regex=True).astype(float) / 100
# Alphabetical order by state name
df_poverty = df_poverty.sort_index()
# Print the number of rows (states) in the dataframe
print(len(df_poverty))
# Display the first 3 rows of the dataframe
df_poverty.head(3)
52
| poverty_rate | |
|---|---|
| Alabama | 0.157 |
| Alaska | 0.105 |
| Arizona | 0.131 |
# Read the Gini-index table (ACS-style export)
df_gini = pd.read_csv('5.Gini_Index_of_Income_Inequality.csv')
df_gini.set_index('Label (Grouping)', inplace=True)
# After transposing, every 2nd row belongs to a new state; only the first
# column (the index value) is needed
df_gini = df_gini.T.iloc[0::2, 0:1]
# The transposed index looks like "StateName!!..." - keep the state name only
df_gini.index = df_gini.index.str.split('!!').str[0]
df_gini.columns = ['Gini_Index']
# Parse the index values as floats and order alphabetically by state
df_gini = df_gini.astype(float).sort_index()
# Print the number of rows (states) in the dataframe
print(len(df_gini))
# Display the first 3 rows of the dataframe
df_gini.head(3)
52
| Gini_Index | |
|---|---|
| Alabama | 0.4797 |
| Alaska | 0.4304 |
| Arizona | 0.4610 |
# SELECTED MONTHLY OWNER COSTS AS A PERCENTAGE OF HOUSEHOLD INCOME (SMOCAPI)
# GROSS RENT AS A PERCENTAGE OF HOUSEHOLD INCOME (GRAPI)
# Load the housing-affordability table (ACS-style export)
df_affordability = pd.read_csv('6.Housing_affordability_by_state.csv')
df_affordability.set_index('Label (Grouping)', inplace=True)
# After transposing, every 4th row belongs to a new state; position 1 is the
# total-units column, 128/129/137/138 are SMOCAPI buckets, 157/158 are GRAPI buckets
df_affordability = df_affordability.T.iloc[0::4, [1, 128, 129, 137, 138, 157, 158]]
# The transposed index looks like "StateName!!..." - keep the state name only
df_affordability.index = df_affordability.index.str.split('!!').str[0]
# Alphabetical order by state name
df_affordability = df_affordability.sort_index()
# Strip thousands separators and convert to integers
df_affordability = df_affordability.replace({',': ''}, regex=True).astype(int)
# Sum the cost-burden buckets vectorized, and BEFORE mutating the frame, so the
# positional slices cannot be shifted by newly appended columns (the original
# code relied on new columns landing after position 6)
smocapi_units = df_affordability.iloc[:, 1:5].sum(axis=1)
grapi_units = df_affordability.iloc[:, 5:7].sum(axis=1)
total_units = df_affordability.iloc[:, 0]
# Final frame: each burden total expressed as a share of total housing units
df_affordability = pd.DataFrame({
    'SMOCAPI_rate': smocapi_units / total_units,
    'GRAPI_rate': grapi_units / total_units,
})
# Print the number of rows (states) in the dataframe
print(len(df_affordability))
# Display the first 3 rows of the dataframe
df_affordability.head(3)
52
| SMOCAPI_rate | GRAPI_rate | |
|---|---|---|
| Alabama | 0.100562 | 0.107019 |
| Alaska | 0.119716 | 0.112095 |
| Arizona | 0.120978 | 0.136437 |
# Load the health-insurance coverage table (ACS-style export)
df_insurance_raw = pd.read_csv('7.Health_insurance_coverage_by_state.csv')
df_insurance_raw.set_index('Label (Grouping)', inplace=True)

def _coverage_rates(row_start, col_name):
    # Pull one rate series: every 10th transposed row starting at row_start,
    # first column only; index trimmed to the bare state name.
    rates = df_insurance_raw.T.iloc[row_start::10, 0:1]
    rates.columns = [col_name]
    rates.index = rates.index.str.split('!!').str[0]
    return rates

# Insured figures sit on rows 4, 14, 24, ...; uninsured on rows 8, 18, 28, ...
df_insurance_insured = _coverage_rates(4, 'insured_rate')
df_insurance_uninsured = _coverage_rates(8, 'uninsured_rate')
# Align the two series on the state-name index
df_insurance = pd.merge(df_insurance_insured, df_insurance_uninsured,
                        left_index=True, right_index=True, how='outer')
# "90.5%" -> 0.905: drop the percent sign, parse as float, rescale
df_insurance = df_insurance.replace({'%': ''}, regex=True)
df_insurance = df_insurance.astype(float) / 100
# Alphabetical order by state name
df_insurance = df_insurance.sort_index()
# Print the number of rows (states) in the dataframe
print(len(df_insurance))
# Display the first 3 rows of the dataframe
df_insurance.head(3)
52
| insured_rate | uninsured_rate | |
|---|---|---|
| Alabama | 0.905 | 0.095 |
| Alaska | 0.883 | 0.117 |
| Arizona | 0.892 | 0.108 |
# To show the length of each dataFrame
for frame in (df_crime, df_emp_income, df_high_edu, df_poverty,
              df_gini, df_affordability, df_insurance):
    print(len(frame))
51 52 52 52 52 52 52
# Join every per-topic dataframe on the state-name index, keeping all states
# that appear in any source (outer join, exactly as the original merge chain)
df_merged = df_emp_income
for frame in (df_high_edu, df_crime, df_poverty,
              df_gini, df_affordability, df_insurance):
    df_merged = pd.merge(df_merged, frame, left_index=True,
                         right_index=True, how='outer')
# Keep only fully populated rows (drops states missing in any source)
df_merged = df_merged.dropna()
# Column subset used in the analysis: predictors first, target last
df_merged = df_merged[['unemployment_rate', 'perc_less_than_high_school',
                       'perc_high_school', 'perc_higher_than_high_school',
                       'income_less_than_high_school', 'income_high_school',
                       'income_higher_than_high_school', 'poverty_rate', 'Gini_Index',
                       'uninsured_rate', 'SMOCAPI_rate', 'GRAPI_rate',
                       'offense_rate_per_person']]
# Report the merged shape
print('Len of dataFrame : ', len(df_merged))
print('Len of columns : ', len(df_merged.columns))
# Display the first 3 rows of the merged dataframe
df_merged.head(3)
Len of dataFrame : 51 Len of columns : 13
| unemployment_rate | perc_less_than_high_school | perc_high_school | perc_higher_than_high_school | income_less_than_high_school | income_high_school | income_higher_than_high_school | poverty_rate | Gini_Index | uninsured_rate | SMOCAPI_rate | GRAPI_rate | offense_rate_per_person | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Alabama | 0.051184 | 0.122396 | 0.310244 | 0.567360 | 0.27789 | 0.34470 | 0.560773 | 0.157 | 0.4797 | 0.095 | 0.100562 | 0.107019 | 0.052019 |
| Alaska | 0.060131 | 0.072740 | 0.310236 | 0.617023 | 0.30804 | 0.41338 | 0.686773 | 0.105 | 0.4304 | 0.117 | 0.119716 | 0.112095 | 0.064844 |
| Arizona | 0.053311 | 0.116928 | 0.250149 | 0.632923 | 0.30398 | 0.36792 | 0.613287 | 0.131 | 0.4610 | 0.108 | 0.120978 | 0.136437 | 0.055133 |
# Work on a copy so df_merged itself stays untouched
df = df_merged.copy()
# Promote the state-name index to a regular column
df = df.reset_index()
# The reset index column is unnamed, so it arrives as 'index'; call it 'state'
df.rename(columns={'index': 'state'}, inplace=True)
# 'offense_rate_per_person' is already the last column of df_merged (it is the
# final entry of the column selection above), so the original pop-and-reappend
# shuffle was a no-op; just keep a handle to the target series under the same name
offense_col = df['offense_rate_per_person']
# Display the modified dataframe
df
| state | unemployment_rate | perc_less_than_high_school | perc_high_school | perc_higher_than_high_school | income_less_than_high_school | income_high_school | income_higher_than_high_school | poverty_rate | Gini_Index | uninsured_rate | SMOCAPI_rate | GRAPI_rate | offense_rate_per_person | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Alabama | 0.051184 | 0.122396 | 0.310244 | 0.567360 | 0.27789 | 0.34470 | 0.560773 | 0.157 | 0.4797 | 0.095 | 0.100562 | 0.107019 | 0.052019 |
| 1 | Alaska | 0.060131 | 0.072740 | 0.310236 | 0.617023 | 0.30804 | 0.41338 | 0.686773 | 0.105 | 0.4304 | 0.117 | 0.119716 | 0.112095 | 0.064844 |
| 2 | Arizona | 0.053311 | 0.116928 | 0.250149 | 0.632923 | 0.30398 | 0.36792 | 0.613287 | 0.131 | 0.4610 | 0.108 | 0.120978 | 0.136437 | 0.055133 |
| 3 | Arkansas | 0.051251 | 0.118151 | 0.344764 | 0.537085 | 0.29018 | 0.34070 | 0.543163 | 0.162 | 0.4821 | 0.088 | 0.092819 | 0.113377 | 0.066256 |
| 4 | California | 0.063567 | 0.148776 | 0.219887 | 0.631338 | 0.29750 | 0.38446 | 0.757200 | 0.121 | 0.4895 | 0.071 | 0.156843 | 0.211474 | 0.054726 |
| 5 | Colorado | 0.044763 | 0.080199 | 0.219769 | 0.700032 | 0.35024 | 0.41596 | 0.656833 | 0.096 | 0.4560 | 0.078 | 0.140562 | 0.152426 | 0.066578 |
| 6 | Connecticut | 0.058839 | 0.087626 | 0.262988 | 0.649386 | 0.29911 | 0.42224 | 0.729250 | 0.101 | 0.4992 | 0.052 | 0.163324 | 0.150156 | 0.034259 |
| 7 | Delaware | 0.054011 | 0.092000 | 0.303230 | 0.604769 | 0.32502 | 0.38456 | 0.641317 | 0.111 | 0.4508 | 0.060 | 0.130143 | 0.111302 | 0.049750 |
| 8 | District of Columbia | 0.070496 | 0.073721 | 0.162478 | 0.763801 | 0.29851 | 0.36093 | 0.835520 | 0.151 | 0.5171 | 0.034 | 0.081585 | 0.232981 | 0.098973 |
| 9 | Florida | 0.050039 | 0.109878 | 0.282792 | 0.607330 | 0.27504 | 0.33773 | 0.562573 | 0.129 | 0.4858 | 0.123 | 0.143659 | 0.150390 | 0.044021 |
| 10 | Georgia | 0.051910 | 0.116976 | 0.281506 | 0.601518 | 0.29323 | 0.35289 | 0.612247 | 0.135 | 0.4787 | 0.129 | 0.114716 | 0.145193 | 0.049890 |
| 11 | Hawaii | 0.047230 | 0.073216 | 0.283717 | 0.643067 | 0.31846 | 0.40755 | 0.620697 | 0.096 | 0.4466 | 0.039 | 0.160129 | 0.169769 | 0.057764 |
| 12 | Idaho | 0.037414 | 0.090435 | 0.278929 | 0.630635 | 0.31438 | 0.35563 | 0.551033 | 0.110 | 0.4436 | 0.097 | 0.127794 | 0.104636 | 0.027531 |
| 13 | Illinois | 0.059386 | 0.100405 | 0.261298 | 0.638297 | 0.31280 | 0.37706 | 0.668587 | 0.118 | 0.4812 | 0.070 | 0.134389 | 0.134002 | 0.041265 |
| 14 | Indiana | 0.044862 | 0.105512 | 0.334103 | 0.560385 | 0.31415 | 0.37447 | 0.587603 | 0.123 | 0.4517 | 0.078 | 0.100647 | 0.116782 | 0.043017 |
| 15 | Iowa | 0.038163 | 0.074463 | 0.305128 | 0.620409 | 0.34302 | 0.38538 | 0.601263 | 0.111 | 0.4431 | 0.048 | 0.106123 | 0.103625 | 0.038517 |
| 16 | Kansas | 0.039536 | 0.086500 | 0.264961 | 0.648538 | 0.32127 | 0.36397 | 0.575753 | 0.116 | 0.4579 | 0.089 | 0.100200 | 0.121229 | 0.054017 |
| 17 | Kentucky | 0.050550 | 0.118493 | 0.333834 | 0.547673 | 0.26748 | 0.34705 | 0.549860 | 0.161 | 0.4768 | 0.059 | 0.104299 | 0.111038 | 0.039753 |
| 18 | Louisiana | 0.065198 | 0.134123 | 0.332168 | 0.533709 | 0.27176 | 0.34837 | 0.545437 | 0.187 | 0.4952 | 0.081 | 0.105982 | 0.130499 | 0.070678 |
| 19 | Maine | 0.040264 | 0.063688 | 0.314534 | 0.621778 | 0.31395 | 0.36542 | 0.551390 | 0.109 | 0.4566 | 0.071 | 0.123696 | 0.087824 | 0.026641 |
| 20 | Maryland | 0.050710 | 0.093205 | 0.247191 | 0.659604 | 0.33131 | 0.42031 | 0.760553 | 0.093 | 0.4559 | 0.059 | 0.138276 | 0.142054 | 0.043614 |
| 21 | Massachusetts | 0.053095 | 0.089142 | 0.237464 | 0.673393 | 0.33558 | 0.43492 | 0.741300 | 0.099 | 0.4878 | 0.027 | 0.149360 | 0.162842 | 0.028704 |
| 22 | Michigan | 0.060210 | 0.087036 | 0.290044 | 0.622919 | 0.28213 | 0.35052 | 0.619983 | 0.131 | 0.4642 | 0.052 | 0.119518 | 0.109060 | 0.039342 |
| 23 | Minnesota | 0.039624 | 0.069728 | 0.245277 | 0.684995 | 0.33094 | 0.39969 | 0.675037 | 0.093 | 0.4496 | 0.046 | 0.119387 | 0.110498 | 0.046225 |
| 24 | Mississippi | 0.063614 | 0.137265 | 0.300594 | 0.562141 | 0.25363 | 0.32974 | 0.502060 | 0.192 | 0.4834 | 0.118 | 0.108706 | 0.111000 | 0.047124 |
| 25 | Missouri | 0.042689 | 0.089992 | 0.308830 | 0.601178 | 0.28861 | 0.35770 | 0.570383 | 0.128 | 0.4661 | 0.095 | 0.102606 | 0.118617 | 0.060351 |
| 26 | Montana | 0.038673 | 0.063087 | 0.288783 | 0.648130 | 0.28127 | 0.34557 | 0.521763 | 0.124 | 0.4644 | 0.084 | 0.127899 | 0.107354 | 0.051872 |
| 27 | Nebraska | 0.030501 | 0.085323 | 0.260431 | 0.654245 | 0.33716 | 0.37196 | 0.588690 | 0.104 | 0.4525 | 0.078 | 0.105011 | 0.123741 | 0.045019 |
| 28 | Nevada | 0.069628 | 0.130202 | 0.292057 | 0.577741 | 0.33455 | 0.37646 | 0.593007 | 0.127 | 0.4675 | 0.114 | 0.125852 | 0.186855 | 0.054796 |
| 29 | New Hampshire | 0.035602 | 0.066112 | 0.279658 | 0.654230 | 0.35190 | 0.44342 | 0.663957 | 0.073 | 0.4426 | 0.058 | 0.153435 | 0.104770 | 0.025462 |
| 30 | New Jersey | 0.062324 | 0.094315 | 0.264983 | 0.640702 | 0.31260 | 0.39913 | 0.763823 | 0.097 | 0.4811 | 0.075 | 0.169723 | 0.159712 | 0.029687 |
| 31 | New Mexico | 0.061420 | 0.132076 | 0.272573 | 0.595351 | 0.24742 | 0.32566 | 0.544143 | 0.183 | 0.4766 | 0.095 | 0.119440 | 0.118536 | 0.076794 |
| 32 | New York | 0.062104 | 0.121565 | 0.253464 | 0.624971 | 0.28885 | 0.38037 | 0.694190 | 0.136 | 0.5147 | 0.052 | 0.130663 | 0.199795 | 0.035268 |
| 33 | North Carolina | 0.049779 | 0.107074 | 0.261838 | 0.631088 | 0.28425 | 0.34303 | 0.581203 | 0.133 | 0.4773 | 0.105 | 0.108411 | 0.127506 | 0.052351 |
| 34 | North Dakota | 0.028779 | 0.068798 | 0.262111 | 0.669090 | 0.35724 | 0.41039 | 0.593530 | 0.108 | 0.4521 | 0.072 | 0.082269 | 0.114057 | 0.047176 |
| 35 | Ohio | 0.050043 | 0.090650 | 0.329399 | 0.579951 | 0.28945 | 0.36811 | 0.616987 | 0.133 | 0.4664 | 0.064 | 0.104453 | 0.124519 | 0.044505 |
| 36 | Oklahoma | 0.048152 | 0.114988 | 0.315775 | 0.569236 | 0.28933 | 0.35412 | 0.548220 | 0.152 | 0.4670 | 0.139 | 0.095963 | 0.119695 | 0.062622 |
| 37 | Oregon | 0.055201 | 0.089290 | 0.237036 | 0.673674 | 0.31465 | 0.36722 | 0.624910 | 0.119 | 0.4610 | 0.065 | 0.143934 | 0.163292 | 0.062235 |
| 38 | Pennsylvania | 0.053624 | 0.087337 | 0.337422 | 0.575241 | 0.30857 | 0.37689 | 0.639963 | 0.118 | 0.4731 | 0.056 | 0.122428 | 0.121853 | 0.035297 |
| 39 | Rhode Island | 0.058413 | 0.104322 | 0.276012 | 0.619666 | 0.33759 | 0.41968 | 0.666443 | 0.112 | 0.4620 | 0.043 | 0.146032 | 0.150050 | 0.032005 |
| 40 | South Carolina | 0.050925 | 0.109021 | 0.292021 | 0.598958 | 0.27171 | 0.34412 | 0.558780 | 0.144 | 0.4772 | 0.102 | 0.114204 | 0.111050 | 0.064300 |
| 41 | South Dakota | 0.030850 | 0.081972 | 0.303243 | 0.614785 | 0.31206 | 0.36840 | 0.551373 | 0.123 | 0.4450 | 0.095 | 0.103360 | 0.098451 | 0.043729 |
| 42 | Tennessee | 0.050093 | 0.107346 | 0.321318 | 0.571336 | 0.28958 | 0.34967 | 0.566773 | 0.140 | 0.4762 | 0.101 | 0.106675 | 0.127207 | 0.062968 |
| 43 | Texas | 0.051812 | 0.146414 | 0.259022 | 0.594564 | 0.28921 | 0.35753 | 0.641707 | 0.139 | 0.4761 | 0.176 | 0.118124 | 0.157229 | 0.054657 |
| 44 | Utah | 0.033400 | 0.074573 | 0.244436 | 0.680991 | 0.34583 | 0.39805 | 0.640380 | 0.085 | 0.4286 | 0.088 | 0.124380 | 0.116449 | 0.049055 |
| 45 | Vermont | 0.038279 | 0.062097 | 0.283682 | 0.654221 | 0.33274 | 0.39151 | 0.559820 | 0.104 | 0.4486 | 0.041 | 0.142091 | 0.102209 | 0.032062 |
| 46 | Virginia | 0.043099 | 0.089986 | 0.251513 | 0.658502 | 0.29521 | 0.38032 | 0.717213 | 0.100 | 0.4724 | 0.074 | 0.121758 | 0.134132 | 0.036044 |
| 47 | Washington | 0.049120 | 0.084966 | 0.231909 | 0.683125 | 0.34397 | 0.42614 | 0.746087 | 0.099 | 0.4637 | 0.064 | 0.139619 | 0.155443 | 0.065799 |
| 48 | West Virginia | 0.060428 | 0.115132 | 0.395372 | 0.489496 | 0.25606 | 0.35146 | 0.521940 | 0.168 | 0.4715 | 0.064 | 0.089368 | 0.086279 | 0.034406 |
| 49 | Wisconsin | 0.033578 | 0.073575 | 0.304380 | 0.622045 | 0.33617 | 0.38969 | 0.611913 | 0.107 | 0.4436 | 0.054 | 0.109131 | 0.116285 | 0.035973 |
| 50 | Wyoming | 0.037829 | 0.067692 | 0.289116 | 0.643192 | 0.31041 | 0.40358 | 0.569480 | 0.107 | 0.4476 | 0.116 | 0.115009 | 0.093669 | 0.037496 |
# USPS two-letter codes for the 50 states plus the District of Columbia,
# keyed by full state name. Stored as "Name:Code" pairs and expanded once.
_STATE_ABBREVIATIONS = (
    'Alabama:AL;Alaska:AK;Arizona:AZ;Arkansas:AR;California:CA;'
    'Colorado:CO;Connecticut:CT;Delaware:DE;Florida:FL;Georgia:GA;'
    'Hawaii:HI;Idaho:ID;Illinois:IL;Indiana:IN;Iowa:IA;'
    'Kansas:KS;Kentucky:KY;Louisiana:LA;Maine:ME;Maryland:MD;'
    'Massachusetts:MA;Michigan:MI;Minnesota:MN;Mississippi:MS;Missouri:MO;'
    'Montana:MT;Nebraska:NE;Nevada:NV;New Hampshire:NH;New Jersey:NJ;'
    'New Mexico:NM;New York:NY;North Carolina:NC;North Dakota:ND;Ohio:OH;'
    'Oklahoma:OK;Oregon:OR;Pennsylvania:PA;Rhode Island:RI;South Carolina:SC;'
    'South Dakota:SD;Tennessee:TN;Texas:TX;Utah:UT;Vermont:VT;'
    'Virginia:VA;Washington:WA;West Virginia:WV;Wisconsin:WI;Wyoming:WY;'
    'District of Columbia:DC'
)
state_code_mapping = dict(pair.split(':') for pair in _STATE_ABBREVIATIONS.split(';'))
# Attach the two-letter abbreviation so plotly can place each state on the map
df['state_abbv'] = df['state'].map(state_code_mapping)
# Indicators to render, one choropleth each
columns_to_plot = [
    'unemployment_rate', 'perc_less_than_high_school', 'perc_high_school',
    'perc_higher_than_high_school', 'income_less_than_high_school',
    'income_high_school', 'income_higher_than_high_school', 'poverty_rate',
    'Gini_Index', 'uninsured_rate', 'SMOCAPI_rate', 'GRAPI_rate'
]
for col in columns_to_plot:
    # Human-readable title derived from the column name
    title = f'US State-wise {col.replace("_", " ").title()}'
    fig = px.choropleth(
        df,
        locations='state_abbv',        # state abbreviations position each row
        locationmode='USA-states',     # interpret locations as US state codes
        color=col,                     # shade states by the current indicator
        hover_name='state',            # show the full state name on hover
        color_continuous_scale='Reds',
        scope='usa',                   # restrict the basemap to the USA
    )
    fig.update_layout(title_text=title)
    fig.show()
import matplotlib.pyplot as plt
import seaborn as sns
# Indicators used as the x-axis of each scatter plot
columns_to_plot = [
    'unemployment_rate', 'perc_less_than_high_school', 'perc_high_school',
    'perc_higher_than_high_school', 'income_less_than_high_school',
    'income_high_school', 'income_higher_than_high_school', 'poverty_rate',
    'Gini_Index', 'uninsured_rate', 'SMOCAPI_rate', 'GRAPI_rate'
]
# 4x3 grid: one panel per indicator, 12 panels total
# (the original comments said "4x4", but the code has always built 4 rows x 3 cols)
fig, axes = plt.subplots(4, 3, figsize=(20, 20))
fig.subplots_adjust(hspace=0.4, wspace=0.4)  # breathing room between panels
# Correlation coefficient of each indicator vs. the offense rate
correlations = {}
# Hoist the columns reused by every panel out of the loop
offense = df['offense_rate_per_person']
abbv_labels = df['state_abbv']
for i, col in enumerate(columns_to_plot):
    ax = axes[i // 3, i % 3]  # row-major position in the 4x3 grid
    sns.scatterplot(x=col, y='offense_rate_per_person', data=df, ax=ax)
    ax.set_title(f'{col} vs. Offense Rate')
    # Tag every point with its state abbreviation
    # (zip over the columns replaces the original index loop with .iloc[j])
    for x_val, y_val, abbv in zip(df[col], offense, abbv_labels):
        ax.text(x_val, y_val, abbv, fontsize=9, ha='left')
    # Correlation between the indicator and the offense rate
    correlation = df[col].corr(offense)
    correlations[col] = correlation  # kept for any later use
    # Print the coefficient just below the panel (axes-relative coordinates)
    ax.text(0.5, -0.2, f'Correlation: {correlation:.4f}',
            ha='center', va='center', transform=ax.transAxes, fontsize=10)
# Show the entire plot with all subplots
plt.show()
# Indicators to analyze (offense_rate_per_person and state columns excluded).
columns_to_analyze = [
    'unemployment_rate', 'perc_less_than_high_school', 'perc_high_school',
    'perc_higher_than_high_school', 'income_less_than_high_school',
    'income_high_school', 'income_higher_than_high_school', 'poverty_rate',
    'Gini_Index', 'uninsured_rate', 'SMOCAPI_rate', 'GRAPI_rate'
]
# Bar colors to cycle through (9 colors are reused across the 12 plots).
colors = itertools.cycle(['blue', 'green', 'red', 'purple', 'orange', 'brown', 'pink', 'gray', 'cyan'])
# 3x4 grid of subplots, one per indicator.
fig, axs = plt.subplots(3, 4, figsize=(20, 16))
plt.subplots_adjust(hspace=0.5, wspace=0.5)  # spacing between subplots
# For each indicator, compare the mean offense rate of the states in the
# top 10% of the indicator against the states in the bottom 10%.
for i, column in enumerate(columns_to_analyze):
    row = i // 4  # subplot row index
    col = i % 4   # subplot column index
    # Top-decile ("high") and bottom-decile ("low") states for this indicator.
    high_states = df[df[column] > df[column].quantile(0.90)]
    low_states = df[df[column] < df[column].quantile(0.10)]
    # Mean offense rate within each group.
    high_offense_rate = high_states['offense_rate_per_person'].mean()
    low_offense_rate = low_states['offense_rate_per_person'].mean()
    # Plain string keys (the previous f-strings contained no placeholders).
    average_offense_rates = {
        'High Group': high_offense_rate,
        'Low Group': low_offense_rate
    }
    # Same color for both bars of a pair; a new color per indicator.
    color = next(colors)
    bars = axs[row, col].bar(average_offense_rates.keys(), average_offense_rates.values(), color=[color, color])
    axs[row, col].set_title(f'{column} vs. Offense Rate')
    axs[row, col].set_ylabel('Average Offense Rate per Person')
    axs[row, col].tick_params(axis='x', rotation=45)  # readable x labels
    # Two-sample t-test (equal variances assumed) on the two groups' offense
    # rates; note the samples are small (~5 states each), so interpret with care.
    t_stat, p_value = stats.ttest_ind(high_states['offense_rate_per_person'], low_states['offense_rate_per_person'])
    # Print the t-statistic just below the subplot (axes coordinates).
    axs[row, col].text(0.45, -0.2, f'T-Stat: {t_stat:.2f}', ha='center', va='center',
                       transform=axs[row, col].transAxes, fontsize=12)
    # Annotate each bar with its offense-rate value.
    for bar in bars:
        yval = bar.get_height()
        axs[row, col].text(bar.get_x() + bar.get_width()/2, yval, round(yval, 4), ha='center', va='bottom')
plt.show()
# Keep only the numeric indicators plus the offense rate for the correlation
# analysis. (The previous version first built df_corr_prepare with
# df.drop(['state', 'state_abbv'], axis=1) and then immediately overwrote it
# with this selection — the drop was dead code and has been removed.)
df_corr_prepare = df[['unemployment_rate', 'perc_less_than_high_school', 'perc_high_school',
                      'perc_higher_than_high_school', 'income_less_than_high_school',
                      'income_high_school', 'income_higher_than_high_school', 'poverty_rate',
                      'Gini_Index', 'uninsured_rate', 'SMOCAPI_rate', 'GRAPI_rate', 'offense_rate_per_person']]
# Pairwise Pearson correlation matrix of the selected columns.
corr_matrix = df_corr_prepare.corr(method='pearson')
# Heatmap of the correlation matrix with numeric annotations.
plt.figure(figsize=(20, 12))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()
# Display only the offense-rate row of the matrix (notebook output).
corr_matrix[corr_matrix.index == 'offense_rate_per_person']
| unemployment_rate | perc_less_than_high_school | perc_high_school | perc_higher_than_high_school | income_less_than_high_school | income_high_school | income_higher_than_high_school | poverty_rate | Gini_Index | uninsured_rate | SMOCAPI_rate | GRAPI_rate | offense_rate_per_person | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| offense_rate_per_person | 0.329029 | 0.240761 | -0.251529 | 0.089909 | -0.256004 | -0.326638 | 0.037118 | 0.433254 | 0.252415 | 0.260131 | -0.343365 | 0.353607 | 1.0 |
Income level and crime rate<br> The correlation coefficient between income_less_than_high_school and offense_rate_per_person is -0.26, showing a negative correlation between income at a given education level and the crime rate: the lower the income, the higher the crime rate tends to be. Therefore, the hypothesis that "lower income levels lead to higher crime rates" is partially supported.
Education level and crime rate<br> The correlation coefficient between perc_less_than_high_school and offense_rate_per_person is 0.24. In addition, the quantile analysis comparing the top 10% and bottom 10% of states shows a standard deviation difference (std diff.) of 3.56. The individual analysis across the four years also shows a consistent positive correlation. This suggests that lower education levels are associated with higher crime rates, indicating that a lack of educational opportunities can contribute to crime. Therefore, the hypothesis that "lower education levels lead to higher crime rates" is strongly supported.
Unemployment rate and crime rate</br> The correlation coefficient between unemployment_rate and offense_rate_per_person is 0.33, indicating a positive correlation. The standard deviation difference (std diff.) between the top and bottom 10% in the quantile analysis is 2.21, which is quite significant. The four-year individual analysis also shows a consistent positive correlation. This suggests that higher unemployment rates are associated with higher crime rates, indicating that economic hardship may contribute to crime. Therefore, the hypothesis that "higher unemployment rates lead to higher crime rates" is strongly supported.
Poverty and crime rate</br> The correlation coefficient between poverty_rate and offense_rate_per_person is 0.43, the highest among the indicators analyzed. The standard deviation difference (std diff.) between the top and bottom 10% of states is confirmed to be 1.78, which is significantly high. A consistent positive correlation is also observed in the four-year individual analysis. This suggests that higher poverty rates are linked to higher crime rates, indicating that economic hardship directly impacts crime. Therefore, the hypothesis that "higher poverty leads to higher crime rates" is strongly supported.
Income inequality and crime rate</br> The Gini Index measures income inequality, where 1 indicates high inequality and 0 indicates equality. The correlation coefficient between the Gini Index and offense_rate_per_person is 0.25, showing a positive correlation, and the standard deviation difference (std diff.) in the quantile analysis is confirmed to be 0.92. This suggests that regions with greater income inequality have higher crime rates, showing the impact of economic disparity on crime. Therefore, the hypothesis that "greater income inequality leads to higher crime rates" is partially supported.
Housing affordability and crime rate</br> Two indicators were used for housing affordability:
Health insurance coverage and crime rate<br> The correlation coefficient between uninsured_rate and offense_rate_per_person is 0.26, showing a positive correlation. However, the standard deviation difference (std diff.) between the top and bottom 10% of states is only -0.13, which does not reinforce the relationship. Overall, regions with higher uninsured rates tend to have somewhat higher crime rates, suggesting that a lack of access to medical services may increase the risk of crime. Therefore, the hypothesis that "lower health insurance coverage leads to higher crime rates" is partially supported.
#### The 4-year correlation values for offense_rate_per_person below are taken from the very end of each year's analysis code in the 4-year individual analysis sub-code.
# Per-year Pearson correlations of each indicator with offense_rate_per_person,
# copied from the tail of each year's notebook in the 4-year sub-analysis.
# (2020 is absent from the source data.)
data = {
    '2018': np.array([[0.48535984, 0.29029329, -0.18510014, 0.00266324, -0.33087903,
                       -0.31308267, -0.00370121, 0.46168091, 0.25004661, 0.290883,
                       -0.38020933, 0.30906348, 1.]]),
    '2019': np.array([[0.48217214, 0.28873284, -0.21387733, 0.02525347, -0.14373621,
                       -0.34352648, 0.00841523, 0.45133352, 0.29352053, 0.27094604,
                       -0.34112612, 0.28329135, 1.]]),
    '2021': np.array([[0.22063691, 0.15386244, -0.31123004, 0.17828932, -0.20761825,
                       -0.2850225, 0.06322003, 0.37081154, 0.24700059, 0.22471481,
                       -0.27419872, 0.36271081, 1.]]),
    '2022': np.array([[0.30574814, 0.20988305, -0.33128763, 0.17809162, -0.22898921,
                       -0.1940222, 0.13694608, 0.27896848, 0.23090561, 0.1617861,
                       -0.19118692, 0.43910273, 1.]])
}
# Row labels, in the same order as the values inside each array.
index = ['unemployment_rate', 'perc_less_than_high_school', 'perc_high_school',
         'perc_higher_than_high_school', 'income_less_than_high_school',
         'income_high_school', 'income_higher_than_high_school', 'poverty_rate',
         'Gini_Index', 'uninsured_rate', 'SMOCAPI_rate', 'GRAPI_rate',
         'offense_rate_per_person']
# One column per year; flatten each (1, 13) array into a 13-element column.
df_individual_4_years = pd.DataFrame(
    {year: arr.ravel() for year, arr in data.items()}, index=index)
# Drop the trivial self-correlation row (always 1.0).
df_individual_4_years = df_individual_4_years.drop('offense_rate_per_person')
# Display the DataFrame (notebook output).
df_individual_4_years
| 2018 | 2019 | 2021 | 2022 | |
|---|---|---|---|---|
| unemployment_rate | 0.485360 | 0.482172 | 0.220637 | 0.305748 |
| perc_less_than_high_school | 0.290293 | 0.288733 | 0.153862 | 0.209883 |
| perc_high_school | -0.185100 | -0.213877 | -0.311230 | -0.331288 |
| perc_higher_than_high_school | 0.002663 | 0.025253 | 0.178289 | 0.178092 |
| income_less_than_high_school | -0.330879 | -0.143736 | -0.207618 | -0.228989 |
| income_high_school | -0.313083 | -0.343526 | -0.285023 | -0.194022 |
| income_higher_than_high_school | -0.003701 | 0.008415 | 0.063220 | 0.136946 |
| poverty_rate | 0.461681 | 0.451334 | 0.370812 | 0.278968 |
| Gini_Index | 0.250047 | 0.293521 | 0.247001 | 0.230906 |
| uninsured_rate | 0.290883 | 0.270946 | 0.224715 | 0.161786 |
| SMOCAPI_rate | -0.380209 | -0.341126 | -0.274199 | -0.191187 |
| GRAPI_rate | 0.309063 | 0.283291 | 0.362711 | 0.439103 |
# Grid layout: three charts per row, enough rows to fit every indicator.
num_vars = len(df_individual_4_years.index)
num_cols = 3
num_rows = -(-num_vars // num_cols)  # ceiling division
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 16))
axes = axes.flatten()  # 1-D indexing is simpler inside the loop
# One bar chart per indicator: its correlation value in each available year.
for i, idx in enumerate(df_individual_4_years.index):
    ax = axes[i]
    bars = ax.bar(df_individual_4_years.columns, df_individual_4_years.loc[idx])
    ax.set_title(f'{idx}')
    ax.set_xlabel('Year')
    ax.set_ylabel('Value')
    # Annotate each bar with its value.
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2, height, round(height, 4),
                ha='center', va='bottom')
# Remove the unused axes at the end of the grid.
for extra_ax in axes[i + 1:]:
    fig.delaxes(extra_ax)
plt.tight_layout()
plt.show()